In [4]:
import pandas as pd
iris_filename = 'datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',', decimal='.', header=None,
names= ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
'target'])
In [5]:
# If the dataset is not available locally, you can follow these steps to
# download it from the Internet:
try:
import urllib.request as urllib2
except ImportError:
import urllib2
url = "http://aima.cs.berkeley.edu/data/iris.csv"
set1 = urllib2.Request(url)
iris_p = urllib2.urlopen(set1)
iris_other = pd.read_csv(iris_p, sep=',', decimal='.',
header=None, names= ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'target'])
iris_other.head()
Out[5]:
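In [ ]:
# As an alternative sketch (assuming the URL above is still reachable),
# pandas can read the CSV straight from the URL, with no explicit
# request object:
iris_from_url = pd.read_csv(url, sep=',', decimal='.', header=None,
                            names=['sepal_length', 'sepal_width',
                                   'petal_length', 'petal_width',
                                   'target'])
iris_from_url.head()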
In [6]:
iris.head()
Out[6]:
In [7]:
iris.tail()
Out[7]:
In [8]:
iris.head(2)
Out[8]:
In [9]:
iris.columns
Out[9]:
In [10]:
Y = iris['target']
Y
Out[10]:
In [11]:
X = iris[['sepal_length', 'sepal_width']]
X
Out[11]:
In [12]:
X.shape
Out[12]:
In [13]:
Y.shape
Out[13]:
In [14]:
import pandas as pd
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')
fake_dataset
Out[14]:
In [15]:
fake_dataset = pd.read_csv('a_loading_example_1.csv',
parse_dates=[0])
fake_dataset
Out[15]:
In [16]:
fake_dataset.fillna(50)
Out[16]:
In [17]:
fake_dataset.fillna(-1)
Out[17]:
In [18]:
fake_dataset.fillna(fake_dataset.mean(axis=0))
Out[18]:
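In [ ]:
# A further imputation sketch: propagate the last valid observation
# forward instead of filling with a constant or the column mean
# (a leading NaN stays missing, as there is nothing to propagate).
fake_dataset.fillna(method='ffill')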
In [19]:
bad_dataset = pd.read_csv('a_loading_example_2.csv',
error_bad_lines=False)
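In [ ]:
# Note: error_bad_lines was deprecated in pandas 1.3 and later removed;
# in recent versions the equivalent spelling is:
# bad_dataset = pd.read_csv('a_loading_example_2.csv',
#                           on_bad_lines='skip')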
In [20]:
import pandas as pd
iris_chunks = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], chunksize=10)
for chunk in iris_chunks:
    print chunk.shape
    print chunk
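In [ ]:
# A minimal sketch of why chunking matters: a statistic can be
# accumulated one chunk at a time, without ever holding the whole
# file in memory.
total_rows = 0
for chunk in pd.read_csv(iris_filename, header=None,
                         names=['C1', 'C2', 'C3', 'C4', 'C5'],
                         chunksize=10):
    total_rows += chunk.shape[0]
print total_rows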
In [21]:
iris_iterator = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], iterator=True)
In [22]:
print iris_iterator.get_chunk(10).shape
In [23]:
print iris_iterator.get_chunk(20).shape
In [24]:
piece = iris_iterator.get_chunk(2)
piece
Out[24]:
In [25]:
import csv
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.DictReader(data_stream,
            fieldnames=['sepal_length', 'sepal_width',
                        'petal_length', 'petal_width', 'target'],
            dialect='excel')):
        if n == 0:
            print n, row
        else:
            break
In [26]:
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.reader(data_stream,
            dialect='excel')):
        if n == 0:
            print row
        else:
            break
In [27]:
import csv
import numpy as np

def batch_read(filename, batch=5):
    # open the data stream
    with open(filename, 'rb') as data_stream:
        # reset the batch
        batch_output = list()
        # iterate over the file
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            # if the batch is of the right size
            if n > 0 and n % batch == 0:
                # yield back the batch as an ndarray
                yield(np.array(batch_output))
                # reset the batch and restart
                batch_output = list()
            # otherwise add the row to the batch
            batch_output.append(row)
        # when the loop is over, yield what's left
        yield(np.array(batch_output))
In [28]:
import numpy as np
for batch_input in batch_read(iris_filename, batch=3):
    print batch_input
    break
In [29]:
import pandas as pd
my_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
[1.0]*5, 'Col3': 1.0, 'Col4': 'Hello World!'})
my_own_dataset
Out[29]:
In [30]:
# This raises a ValueError: Col1 (5 values) and Col3 (2 values)
# have incompatible lengths
my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
    'string', 'Col3': range(2)})
In [31]:
my_own_dataset.dtypes
Out[31]:
In [32]:
my_own_dataset['Col1'] = my_own_dataset['Col1'].astype(float)
my_own_dataset.dtypes
Out[32]:
In [33]:
mask_feature = iris['sepal_length'] > 6.0
mask_feature
Out[33]:
In [34]:
mask_target = iris['target'] == 'Iris-virginica'
In [35]:
iris.loc[mask_target, 'target'] = 'New label'
In [36]:
iris['target'].unique()
Out[36]:
In [37]:
grouped_targets_mean = iris.groupby(['target']).mean()
grouped_targets_mean
Out[37]:
In [38]:
grouped_targets_var = iris.groupby(['target']).var()
grouped_targets_var
Out[38]:
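In [ ]:
# Both statistics can also be obtained in a single pass with agg:
iris.groupby(['target']).agg(['mean', 'var'])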
In [39]:
iris.sort_index(by='sepal_length').head()
Out[39]:
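In [ ]:
# Note: sort_index(by=...) has since been removed from pandas;
# the equivalent in recent versions is:
# iris.sort_values(by='sepal_length').head()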
In [40]:
# This is just an example, with no time_series data
# smooth_time_series = pd.rolling_mean(time_series, 5)
In [41]:
# This is just an example, with no time_series data
# median_time_series = pd.rolling_median(time_series, 5)
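In [ ]:
# In recent pandas (>= 0.18) the rolling functions are methods instead:
# smooth_time_series = time_series.rolling(window=5).mean()
# median_time_series = time_series.rolling(window=5).median()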
In [42]:
import pandas as pd
dataset = pd.read_csv('a_selection_example_1.csv')
dataset
Out[42]:
In [43]:
dataset = pd.read_csv('a_selection_example_1.csv', index_col=0)
dataset
Out[43]:
In [44]:
dataset['val3'][104]
Out[44]:
In [45]:
dataset.loc[104, 'val3']
Out[45]:
In [46]:
dataset.ix[104, 'val3']
Out[46]:
In [47]:
dataset.ix[104, 2]
Out[47]:
In [48]:
dataset.iloc[4, 2]
Out[48]:
In [49]:
dataset[['val3', 'val2']][0:2]
Out[49]:
In [50]:
dataset.loc[range(100, 102), ['val3', 'val2']]
Out[50]:
In [51]:
dataset.ix[range(100, 102), ['val3', 'val2']]
Out[51]:
In [52]:
dataset.ix[range(100, 102), [2, 1]]
Out[52]:
In [53]:
dataset.iloc[range(2), [2,1]]
Out[53]:
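In [ ]:
# Note: .ix was deprecated in pandas 0.20 and removed in 1.0; in recent
# versions use .loc for labels and .iloc for positions, as shown above.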
In [54]:
import pandas as pd
categorical_feature = pd.Series(['sunny', 'cloudy', 'snowy',
'rainy', 'foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping
Out[54]:
In [55]:
mapping['sunny']
Out[55]:
In [56]:
mapping['cloudy']
Out[56]:
In [57]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ohe = OneHotEncoder()
levels = ['sunny', 'cloudy', 'snowy', 'rainy', 'foggy']
fit_levs = le.fit_transform(levels)
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]],
[fit_levs[4]]])
print ohe.transform([le.transform(['sunny'])]).toarray()
print ohe.transform([le.transform(['cloudy'])]).toarray()
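In [ ]:
# A minimal sketch, assuming scikit-learn >= 0.20, where OneHotEncoder
# accepts string categories directly and the LabelEncoder step is no
# longer needed:
# ohe = OneHotEncoder()
# ohe.fit_transform([['sunny'], ['cloudy'], ['snowy'], ['rainy'],
#                    ['foggy']]).toarray()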
In [60]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)
In [59]:
twenty_sci_news.data[0]
In [ ]:
twenty_sci_news.filenames
In [ ]:
print twenty_sci_news.target[0]
print twenty_sci_news.target_names[twenty_sci_news.target[0]]
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
word_count.shape
In [ ]:
print word_count[0]
In [ ]:
word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    print "Word:", word_list[n], "appears", word_count[0, n], "times"
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print "Word:", word_list[n], "has frequency", word_freq[0, n]
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # Default: use_idf=True
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
word_list = tfidf_vect.get_feature_names()
for n in word_tfidf[0].indices:
    print "Word:", word_list[n], "has tfidf", word_tfidf[0, n]
In [ ]:
text_1 = 'we love data science'
text_2 = 'data science is hard'
documents = [text_1, text_2]
documents
In [ ]:
# This is what we saw above: the default, unigram-only vectorizer
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1),
stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [ ]:
# Now a bi-gram count vectorizer
count_vect_2_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_2_grams.fit_transform(documents)
word_list = count_vect_2_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [ ]:
# Now a uni- and bi-gram count vectorizer
count_vect_1_2_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_1_2_grams.fit_transform(documents)
word_list = count_vect_1_2_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.fit_transform(twenty_sci_news.data)
word_hashed.shape
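In [ ]:
# Since hashing is stateless, the same vectorizer can transform unseen
# text without any extra fitting step:
hash_vect.transform(['a brand new document']).shape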
In [ ]:
import numpy as np
# Transform a list into a uni-dimensional array
list_of_ints = [1,2,3]
Array_1 = np.array(list_of_ints)
Array_1
In [ ]:
Array_1[1] # let's output the second value
In [ ]:
type(Array_1)
In [ ]:
Array_1.dtype # Note: the default dtype depends on the system you are running on.
In [ ]:
import numpy as np
Array_1.nbytes # Please note that on 64bit platforms the result will be 24.
In [ ]:
Array_1 = np.array(list_of_ints, dtype= 'int8')
In [ ]:
Array_1b = Array_1.astype('float32')
Array_1b
In [ ]:
import numpy as np
complex_list = [1,2,3] + [1.,2.,3.] + ['a','b','c']
Array_2 = np.array(complex_list[:3]) # at first the input list is just ints
print 'complex_list[:3]', Array_2.dtype
Array_2 = np.array(complex_list[:6]) # then it is ints and floats
print 'complex_list[:6]', Array_2.dtype
Array_2 = np.array(complex_list) # finally we add strings
print 'complex_list[:] ',Array_2.dtype
In [ ]:
# Check if a NumPy array is of the desired numeric type
print isinstance(Array_2[0],np.number)
In [ ]:
import numpy as np
# Transform a list into a bidimensional array
a_list_of_lists = [[1,2,3],[4,5,6],[7,8,9]]
Array_2D = np.array(a_list_of_lists)
Array_2D
In [ ]:
Array_2D[1,1]
In [ ]:
# Transform a list into a multi-dimensional array
a_list_of_lists_of_lists = [[[1,2],[3,4],[5,6]],
[[7,8],[9,10],[11,12]]]
Array_3D = np.array(a_list_of_lists_of_lists)
Array_3D
In [ ]:
Array_3D[0,2,0] # Accessing the 5th element
In [ ]:
np.array({1:2,3:4,5:6}.items())
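In [ ]:
# Note for Python 3, where .items() returns a view rather than a list:
# np.array(list({1:2, 3:4, 5:6}.items()))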
In [ ]:
import numpy as np
# Restructuring a NumPy array shape
original_array = np.array([1, 2, 3, 4, 5, 6, 7, 8])
Array_a = original_array.reshape(4,2)
Array_b = original_array.reshape(4,2).copy()
Array_c = original_array.reshape(2,2,2)
# Attention: reshape returns views, not copies, whenever possible
original_array[0] = -1
In [ ]:
Array_a
In [ ]:
Array_c
In [ ]:
Array_b
In [ ]:
original_array.resize(4,2)
original_array
In [ ]:
original_array.shape = (4,2)
In [ ]:
original_array
In [ ]:
import numpy as np
ordinal_values = np.arange(9).reshape(3,3)
ordinal_values
In [ ]:
np.arange(9)[::-1]
In [ ]:
np.random.randint(low=1, high=10, size=(3,3))
In [ ]:
np.zeros((3,3))
In [ ]:
np.ones((3,3))
In [ ]:
np.eye(3)
In [ ]:
fractions = np.linspace(start=0, stop=1, num=10)
fractions
In [ ]:
growth = np.logspace(start=0, stop=1, num=10, base=10.0)
growth
In [ ]:
std_gaussian = np.random.normal(size=(3,3))
std_gaussian
In [ ]:
gaussian = np.random.normal(loc=1.0, scale= 3.0, size=(3,3))
gaussian
In [ ]:
np.random.uniform(low=0.0, high=1.0, size=(3,3))
In [ ]:
import numpy as np
housing = np.loadtxt('regression-datasets-housing.csv',delimiter=',', dtype=float)
In [ ]:
# This raises a ValueError: the last column of the Iris dataset contains
# string labels, which cannot be parsed as float
np.loadtxt('datasets-uci-iris.csv', delimiter=',', dtype=float)
In [ ]:
import pandas as pd
import numpy as np
housing_filename = 'regression-datasets-housing.csv'
housing = pd.read_csv(housing_filename, header=None)
In [ ]:
housing_array = housing.values
housing_array.dtype
In [ ]:
housing.dtypes
In [ ]:
import numpy as np
a = np.arange(5).reshape(1,5)
a += 1
a*a
In [ ]:
a = np.arange(5).reshape(1,5) + 1
b = np.arange(5).reshape(5,1) + 1
a * b
In [ ]:
a2 = np.array([1,2,3,4,5] * 5).reshape(5,5)
b2 = a2.T
a2 * b2
In [ ]:
print a2
In [ ]:
np.sum(a2, axis=0)
In [ ]:
np.sum(a2, axis=1)
In [ ]:
%timeit -n 1 -r 3 [i+1.0 for i in range(10**6)]
%timeit -n 1 -r 3 np.arange(10**6)+1.0
In [ ]:
import math
%timeit -n 1 -r 3 [math.sqrt(i) for i in range(10**6)]
In [ ]:
%timeit -n 1 -r 3 np.sqrt(np.arange(10**6))
In [ ]:
import numpy as np
M = np.arange(5*5, dtype=float).reshape(5,5)
M
In [ ]:
coefs = np.array([1., 0.5, 0.5, 0.5, 0.5])
coefs_matrix = np.column_stack((coefs,coefs[::-1]))
print coefs_matrix
In [ ]:
np.dot(M,coefs)
In [ ]:
np.dot(coefs,M)
In [ ]:
np.dot(M,coefs_matrix)
In [ ]:
import numpy as np
M = np.arange(10*10, dtype=int).reshape(10,10)
In [ ]:
M[2:9:2,:]
In [ ]:
M[2:9:2,5:]
In [ ]:
M[2:9:2,5::-1]
In [ ]:
# In the book, the output of this cell is wrong;
# the correct output is reported here.
row_index = (M[:,0]>=20) & (M[:,0]<=80)
col_index = M[0,:]>=5
M[row_index,:][:,col_index]
In [ ]:
mask = (M>=20) & (M<=90) & ((M / 10.) % 1 >= 0.5)
M[mask]
In [ ]:
row_index = [1,1,2,7]
col_index = [0,2,4,8]
In [ ]:
M[row_index,col_index]
In [ ]:
M[row_index,:][:,col_index]
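In [ ]:
# A minimal alternative sketch: np.ix_ builds the same cross-product
# selection as the double indexing above, in a single step.
M[np.ix_(row_index, col_index)]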
In [ ]:
N = M[2:9:2,5:].copy()
In [ ]:
import numpy as np
dataset = np.arange(10*5).reshape(10,5)
In [ ]:
single_line = np.arange(1*5).reshape(1,5)
a_few_lines = np.arange(3*5).reshape(3,5)
In [ ]:
np.vstack((dataset,single_line))
In [ ]:
np.vstack((dataset,a_few_lines))
In [ ]:
np.vstack((dataset,single_line,single_line))
In [ ]:
bias = np.ones(10).reshape(10,1)
np.hstack((dataset,bias))
In [ ]:
bias = np.ones(10)
np.column_stack((dataset,bias))
In [ ]:
np.dstack((dataset*1,dataset*2,dataset*3))
In [ ]:
np.insert(dataset, 3, bias, axis=1)
In [ ]:
np.insert(dataset, 3, dataset.T, axis=1)
In [ ]:
np.insert(dataset, 3, np.ones(5), axis=0)